Warning : We are now using DFam consensus !
TE consensus sequences are contained in D_mel_transposon_sequence_set.fa (downloaded from https://github.com/bergmanlab/transposons/ ) By aligning reads to these sequences, we can identify which TEs are expressed and get countings.
Commands :
~/softs/minimap2/minimap2 -ax splice ~/cristina_TE/data/consensus/D_mel_transposon_sequence_set.fa ~/cristina_TE/data/reads/FC29.fastq.gz | samtools sort -o FC29.against_consensus.bam
~/softs/minimap2/minimap2 -ax splice ~/cristina_TE/data/consensus/D_mel_transposon_sequence_set.fa ~/cristina_TE/data/reads/FC30.fastq.gz | samtools sort -o FC30.against_consensus.bam
We get two bam files as a result (FC29 : 5.2Go, FC30 : 1.8Go )
Here's an example of what's in D_mel_transposon_sequence_set.fa :
>1731#LTR/Copia
TGTTGAATATAGGCAATGCCCACATGTGTGTTGAATATAGGCAATTTCCACATGTGCATA
TGTAATTTTGTATGAGAACATACATACATACACATGAACTGTATGTATGTATATATATTA
And here's the beginning of the BAM we get by aligning our reads to the TEs consensus sequences :
e1bae963-07b0-44fa-8a67-ae90f86c4ba0 2064 Tc3#DNA/Tc1-Mariner 331 38 710H32M5D19M2D8M1D5M1D18M1D4M1D124M1384H * 0
0 GGTTTTACGCTTCTAACTTGACTTCTTGTTTGTTAAATCTCGAAAGTTAAATTCTTTTGATTCTAAATATAAATTATCTTTTTAATTTTTTCTCAAATGGTCCGCGAAAAGTCTTTATCCGATTTTGAAAAAGGTCAAATCAAAGGCTATATTGAATCTGGTTTAAAACACTGTGTAATAGCCAAGAAAATCGGTTGAAGTCAAAACGTT 53/a:/-$7)4/($,%"/$#''-2#"$-5'"''$#'&$%'$"+2%4)*+/)(:?-L/$)2(('#/7+(,$+.#&%(%#&'0-%+(*16995,0.*?;3;9;+*+*/(--)&"%()%8<89210OG?*)NPMPKGC)6>:>*/H5?;-%3/2B4.DLL./A8<:*'A>-"1%#)0255A<*5H2.N(JQ@=J+?@A+-6L7K4)KT1=@WR NM:i:35 ms:i:115 AS:i:115 nn:i:0 tp:A:P cm:i:9 s1:i:64 s2:i:0 de:f:0.1389 SA:Z:McClintock#LTR/Gypsy,592,+,1229S153M2I920S,60,11; rl:i:0
This read generates two alignments: one primary (in McClintock#LTR/Gypsy) and one supplementary (in Tc3#DNA/Tc1-Mariner). The alignment score is better in the primary. In both cases, the alignment covers only a very small part of the consensus sequence...
Bergman consensus gtf contained very small sequences of TE... Trying again with new TE sequences from DFAM ([https://dfam.org/browse?clade=7215&clade_descendants=true])
Downloading those sequences using FASTA button at the bottom of the page...
Those TE sequences are often fragmented in 2 parts : the intern part and the LTR part. We need to match the LTR part at the beginning and the end of the intern part.
Two possibilities to match intern and LTR part : {TE_name}_I / {TE_name}_LTR OR {TE_name}-I_DM / {TE_name}-LTR_DM
I wrote a tool to generate a consensus fasta file with merged intern and ltr part here : https://github.com/EricAmren/TE_LTR_flanker
import pysam
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import dtale
import subprocess
import HTSeq
# BAM files of the long reads aligned against the DFam consensus sequences
# (FC29 = male sample, FC30 = female sample — see the counts table below).
# The commented paths are the previous run against the Bergman-lab consensus set.
# FC29_bamfile = "data/consensus/bam/FC29.against_consensus.bam"
# FC30_bamfile = "data/consensus/bam/FC30.against_consensus.bam"
# consensus_fasta_file = "data/consensus/D_mel_transposon_sequence_set.fa"
FC29_bamfile = "data/consensus/bam/FC29.against_DFam_consensus.bam"
FC30_bamfile = "data/consensus/bam/FC30.against_DFam_consensus.bam"
# DFam consensus with LTR parts merged onto internal parts (TE_LTR_flanker output).
consensus_fasta_file = "data/consensus/families.flanked_LTR.hierarchy.fa"
def _count_alignment_categories(bamfile):
    """Count the alignment records of a BAM by category.

    Returns [total, primary, secondary, supplementary, unmapped] so the
    result can be used directly as one row of the counts table below.
    (Replaces two copy-pasted counting loops of the original.)
    """
    primary = secondary = supplementary = unmapped = 0
    with pysam.AlignmentFile(bamfile, 'rb') as bam:
        for ali in bam:
            # Same if/elif cascade (and therefore same precedence) as the
            # original: supplementary first, then unmapped, then secondary.
            if ali.is_supplementary:
                supplementary += 1
            elif ali.is_unmapped:
                unmapped += 1
            elif ali.is_secondary:
                secondary += 1
            else:
                primary += 1
    total = primary + secondary + supplementary + unmapped
    return [total, primary, secondary, supplementary, unmapped]

# FC29 = male sample, FC30 = female sample.
male_counts = _count_alignment_categories(FC29_bamfile)
female_counts = _count_alignment_categories(FC30_bamfile)
# Assemble the per-sex alignment-category counts into one table.
counts_list = [female_counts, male_counts]
category_names = ["Total", "Primary", "Secondary", "Supplementary", "Unmapped"]
counts_df = pd.DataFrame(counts_list, columns=category_names)
counts_df.insert(1, "Sex", ["Female", "Male"], True)
# Notebook display with "Sex" as index; counts_df itself keeps its integer
# index (the set_index result is deliberately not assigned back).
counts_df.set_index("Sex")
Number of reads aligning against the first consensus file (overall, the numbers barely changed):
| Sex | Total | Primary | Secondary | Supplementary | Unmapped |
|---|---|---|---|---|---|
| Female | 1236291 | 1241 | 262 | 29 | 1234759 |
| Male | 2926979 | 7372 | 1000 | 425 | 2918182 |
99.7% of those reads were mapped against dmgoth genome.
# Fraction of reads with a primary alignment on a TE consensus, per sex.
ratio_mapped_female = counts_df["Primary"][0] / counts_df["Total"][0]
print(f"Ratio of read mapped to ET (Female) = {ratio_mapped_female}")
ratio_mapped_male = counts_df["Primary"][1] / counts_df["Total"][1]
print(f"Ratio of read mapped to ET (Male) = {ratio_mapped_male}")
# Build a by-name read index over the male BAM, so that individual reads can
# be retrieved later (used below for the chimeric-read inspection).
# NOTE: the file handle is deliberately left open — read_index needs it.
FC29_alignments = pysam.AlignmentFile(FC29_bamfile, 'rb')
read_index = pysam.IndexedReads(FC29_alignments)
read_index.build()
def get_countings_from_bam(bamfile):
    """Return {reference_name: number of primary alignments} for a BAM file.

    Secondary, supplementary and unmapped records are skipped so each read
    contributes at most one count.
    """
    counting_dict = dict()
    # Context manager fixes the file-handle leak of the original version.
    with pysam.AlignmentFile(bamfile, 'rb') as alignment_file:
        for ali in alignment_file:
            if not ali.is_secondary and not ali.is_supplementary and not ali.is_unmapped:
                counting_dict[ali.reference_name] = counting_dict.get(ali.reference_name, 0) + 1
    return counting_dict
# Per-TE primary-alignment counts per sample (FC29 = male, FC30 = female).
consensus_counting_male_dict = get_countings_from_bam(FC29_bamfile)
consensus_counting_female_dict = get_countings_from_bam(FC30_bamfile)
# Merge the two per-sex count dicts into {TE: [female_count, male_count]},
# defaulting to 0 when a TE is expressed in only one sex.
full_consensus_counting_dict = dict()
for feature, female_count in consensus_counting_female_dict.items():
    male_count = consensus_counting_male_dict.get(feature, 0)
    full_consensus_counting_dict[feature] = [female_count, male_count]
for feature, male_count in consensus_counting_male_dict.items():
    if feature not in consensus_counting_female_dict:
        full_consensus_counting_dict[feature] = [0, male_count]
# Library sizes (total primary counts per sex), used to normalise the counts.
total_count_female = sum(pair[0] for pair in full_consensus_counting_dict.values())
total_count_male = sum(pair[1] for pair in full_consensus_counting_dict.values())
# Per-TE fraction of the library.  The zero-count guard also protects
# against dividing by an all-zero library total.
full_consensus_ratio_dict = dict()
for feature, (count_female, count_male) in full_consensus_counting_dict.items():
    female_ratio = count_female / total_count_female if count_female else 0
    male_ratio = count_male / total_count_male if count_male else 0
    full_consensus_ratio_dict[feature] = [female_ratio, male_ratio]
# Counts table: one row per TE consensus, columns Female / Male.
consensus_counting_df = pd.DataFrame.from_dict(full_consensus_counting_dict).T
consensus_counting_df.columns = ["Female", "Male"]
consensus_counting_df.index.name = "ID"
# Enrich a copy with the taxonomy encoded in the ID: "family#subclass/superfamily".
consensus_TE_counts_df = consensus_counting_df.copy()
split_ids = [re.split('#|/', te) for te in consensus_TE_counts_df.index]
consensus_TE_counts_df.insert(0, "Family", [parts[0] for parts in split_ids])
consensus_TE_counts_df.insert(1, "SuperFamily", [parts[2] for parts in split_ids])
consensus_TE_counts_df.insert(2, "SubClass", [parts[1] for parts in split_ids])
def get_TE_mean_subject_coverage(genome_file, bamfile):
    """Mean fraction of each consensus sequence covered by its primary alignments.

    genome_file : FASTA of the TE consensus sequences (headers list the features).
    bamfile     : sorted + indexed BAM of reads aligned against those consensus.
    Returns {feature: mean(reference_length / feature_length)}; 0 when no
    primary alignment hits the feature.
    """
    feature_list = []
    with open(genome_file, 'r') as genome:
        for line in genome:
            if line.startswith(">"):
                feature_list.append(line[1:].strip())
    TE_subject_coverage = dict()
    # Context manager fixes the file-handle leak of the original version.
    with pysam.AlignmentFile(bamfile) as alignments:
        # Assumes FASTA header order matches the BAM header order — TODO confirm.
        feature_lengths_dict = dict(zip(feature_list, alignments.lengths))
        for feature, feature_length in feature_lengths_dict.items():
            coverages = [
                ali.reference_length / feature_length
                for ali in alignments.fetch(feature)
                if not ali.is_secondary and not ali.is_supplementary and not ali.is_unmapped
            ]
            TE_subject_coverage[feature] = (
                sum(coverages) / len(coverages) if coverages else 0
            )
    return TE_subject_coverage
# Mean subject coverage per TE, per sex (FC30 = female, FC29 = male).
female_TE_subject_coverage = get_TE_mean_subject_coverage(consensus_fasta_file, FC30_bamfile)
male_TE_subject_coverage = get_TE_mean_subject_coverage(consensus_fasta_file, FC29_bamfile)
## Adding ratio to big dataframe
female_consensus_ratio_dict = {te: ratios[0] for te, ratios in full_consensus_ratio_dict.items()}
male_consensus_ratio_dict = {te: ratios[1] for te, ratios in full_consensus_ratio_dict.items()}
index_series = consensus_counting_df.index.to_series()
consensus_TE_counts_df['Female_Ratio'] = index_series.map(female_consensus_ratio_dict)
consensus_TE_counts_df['Male_Ratio'] = index_series.map(male_consensus_ratio_dict)
## Adding subject coverage to big dataframe
consensus_TE_counts_df['Female_subject_coverage'] = index_series.map(female_TE_subject_coverage)
consensus_TE_counts_df['Male_subject_coverage'] = index_series.map(male_TE_subject_coverage)
## Make sure every column is in the right order:
column_order = ['Family', 'SuperFamily', 'SubClass', 'Female', 'Male',
                'Female_Ratio', 'Male_Ratio',
                'Female_subject_coverage', 'Male_subject_coverage']
consensus_TE_counts_df = consensus_TE_counts_df[column_order]
consensus_TE_counts_df.head()
consensus_TE_counts_df.to_csv("consensus_counting.csv")
# Persist the per-TE ratio table, then reload both CSVs so the notebook can
# be restarted from this point without recomputing the BAM passes.
consensus_ratio_df = pd.DataFrame.from_dict(full_consensus_ratio_dict).T
consensus_ratio_df.columns = ["Female", "Male"]
consensus_ratio_df.index.name = "TE"
consensus_ratio_df.to_csv("consensus_TE_ratio.csv")
consensus_ratio_df = pd.read_csv("consensus_TE_ratio.csv", index_col="TE")
consensus_counting_df = pd.read_csv("consensus_counting.csv", index_col="ID")
consensus_ratio_df.head()
consensus_counting_df.head()
# Grouped bar chart: per-TE expression ratio, female vs male.
fig = go.Figure()
fig.add_trace(go.Bar(
    x=consensus_ratio_df.index,
    y=consensus_ratio_df["Female"],
    name='Female',
    marker_color='indianred'
))
fig.add_trace(go.Bar(
    x=consensus_ratio_df.index,
    y=consensus_ratio_df["Male"],
    name='Male',
    marker_color='blue'
))
# Abbreviated value labels; hide the ones that would not fit at this size.
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=1, uniformtext_mode='hide')
fig.update_layout(barmode='group', xaxis_tickangle=-45, width=1500)
fig.show()
# Matplotlib version of the same comparison, one subplot per sex.
axs = consensus_ratio_df.plot.bar(figsize=(25,6), subplots=True, ylim=(0, 0.25))
# Donut charts: share of counted reads per TE SubClass, female vs male.
df = consensus_counting_df.copy()
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(
    go.Pie(labels=df["SubClass"], values=df["Female"], textinfo='label+percent', name = "Female"),
    1, 1)
fig.add_trace(
    go.Pie(labels=df["SubClass"], values=df["Male"], textinfo='label+percent', name = "Male"),
    1, 2)
fig.update_traces(hole=.4, hoverinfo="label+value")
fig.update_layout(
    title_text="Ratio of expressed TE per SubClass ",
    # Add annotations in the center of the donut pies.
    annotations=[dict(text='Female', x=0.16, y=0.5, font_size=20, showarrow=False),
                 dict(text='Male', x=0.82, y=0.5, font_size=20, showarrow=False)])
fig.show()
# Donut charts: share of counted reads per TE SuperFamily, female vs male.
df = consensus_counting_df.copy()
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(
    go.Pie(labels=df["SuperFamily"], values=df["Female"], textinfo='label', name = "Female"),
    1, 1)
fig.add_trace(
    go.Pie(labels=df["SuperFamily"], values=df["Male"], textinfo='label', name = "Male"),
    1, 2)
fig.update_traces(hole=.4, hoverinfo="label+value+percent")
fig.update_layout(
    title_text="Ratio of expressed TE per SuperFamily ",
    # Add annotations in the center of the donut pies.
    annotations=[dict(text='Female', x=0.15, y=0.5, font_size=20, showarrow=False),
                 dict(text='Male', x=0.82, y=0.5, font_size=20, showarrow=False)])
fig.show()
# Interactive table inspection in the browser.
dtale.show(df)
consensus_counting_df.head()
# Sunburst (SubClass > SuperFamily > Family) of male counts, colored by the
# mean subject coverage; only TEs with at least one count are shown.
df = consensus_counting_df.copy()
expressed = df["Male"] > 0
df = df[expressed]
fig = px.sunburst(df, path=['SubClass', 'SuperFamily', 'Family'], values='Male', color = "Male_subject_coverage")
fig.show()
# Same sunburst for the female sample.
df = consensus_counting_df.copy()
expressed = df["Female"] > 0
df = df[expressed]
fig = px.sunburst(df, path=['SubClass', 'SuperFamily', 'Family'], values='Female', color = "Female_subject_coverage")
fig.show()
The query coverage of some TE can be very low, as seen here on IGV for 1731#LTR/Copia :

When we blatted the sequence of read e687c074-c5e2-4c85-9ba0-492e6a4a8a84 (in black on previous picture), this is what we got :

Our read is divided in at least four part :
One part of our read (in pink from 2385 to 4145) is aligned a number of times against the duplication 1731 of Copia, in several chromosomes (which is normal for a TE). Another one (in orange from 4 to 1464) is aligned three times to genes coding for a very similar protein (FBtr0081639 = alphaTub84B, which is a gene from chr3R:7086599-7088839, FBtr0081538 = alphaTub84D, another gene from chr3R:7530753-7532732, and FBtr0082087 = alphaTub85E, from chr3R:9731221-9733250).
This read seems to be chimeric: it looks like several reads glued together. To check whether this is the case, we could have a look at the sequences lying between each part. Maybe one is a redundant sequence that we could find in other reads, in which case it would suggest that it is an adapter or something similar... We should align these in-between sequences and check if we can find them elsewhere.
# Pull the "black" read (see the IGV screenshot above) and extract the
# unaligned linker sequences lying between its aligned segments, using the
# blat segment coordinates.
black_read = read_index.find("e687c074-c5e2-4c85-9ba0-492e6a4a8a84")
black_read_sequence = list(black_read)[0].query_sequence
orange2green = black_read_sequence[1464:1500]
green2pink = black_read_sequence[2317:2385]
pink2purple = black_read_sequence[4141:4208]
# Same output order as before: the three lengths, then the three sequences.
for linker in (orange2green, green2pink, pink2purple):
    print(len(linker))
for linker in (orange2green, green2pink, pink2purple):
    print(linker)
Only pink2purple has a blat result. Let's compare those sequences to another read aligned to copia dup 1731 (7596c024-a22d-48a3-b6e4-ec4d75980c0a)
# Same linker extraction on a second read aligned to the Copia 1731 duplication.
second_read = read_index.find("7596c024-a22d-48a3-b6e4-ec4d75980c0a")
second_read_sequence = list(second_read)[0].query_sequence
green2pink_2 = second_read_sequence[1502:1575]
pink2orange_2 = second_read_sequence[3362:3419]
print(green2pink_2)
print(pink2orange_2)
The new read can be divided in 3 parts : Copia, gene1 and gene2.

There might be sequences concatenating several reads due to an error in the protocol.
Alignments were generated using minimap2 and the junc-bed option :
minimap2 -ax splice --junc-bed dmgoth101.onecode.fixed.bed -o FC30.against_dmgoth.sam dmgoth101_assembl_chrom.fasta /data/home/ecumunel/cristina_TE/data/reads/FC30.fastq.gz
minimap2 -ax splice --junc-bed dmgoth101.onecode.fixed.bed -o FC29.against_dmgoth.sam dmgoth101_assembl_chrom.fasta /data/home/ecumunel/cristina_TE/data/reads/FC29.fastq.gz
dmgoth101.onecode.fixed.bed file was translated from dmgoth101.onecode.fixed.gtf to bed using the convert2bed tool:
convert2bed --input=gtf < dmgoth101.onecode.fixed.gtf > dmgoth101.onecode.fixed.bed
add_positions_info_to_gff.py : TE copy positions were also added to the gtf annotations so that we have a column named "transcript_id" (example : "Copia_LTR_2L_RaGOO_12650_17604";). This column can then be used with FeatureCounts to count TEs by position site.
filter_RM_gff.py : Irrelevant features were filtered out of the original annotations.
Used software : FeatureCounts
Global strategy :
# Genome-based alignments (minimap2 against the dmgoth101 assembly) and the
# output paths that will hold only each read's best-scoring alignment(s).
FC29_bamfile = "data/dmgoth_data/bam/FC29.against_dmgoth.bam"
FC30_bamfile = "data/dmgoth_data/bam/FC30.against_dmgoth.bam"
FC30_bam_output = "data/dmgoth_data/bam/FC30.against_dmgoth.filtered_max_AS.bam"
FC29_bam_output = "data/dmgoth_data/bam/FC29.against_dmgoth.filtered_max_AS.bam"
# FC30_alignments = pysam.AlignmentFile(FC30_bamfile, 'rb')
# FC30_read_index = pysam.IndexedReads(FC30_alignments)
# FC30_read_index.build()
def filter_only_max_AS_alignment_from_bam(bamfile, new_bamfile):
    """Write to new_bamfile only the alignment(s) of each read with the best
    AS (alignment score) tag for that read.

    Supplementary and unmapped records are dropped; several alignments can
    be kept for one read when they tie on the best score.
    Returns the path of the filtered BAM.
    """
    # We need to browse alignments grouped by read: pysam's IndexedReads.
    alignments = pysam.AlignmentFile(bamfile, 'rb')
    read_index = pysam.IndexedReads(alignments)
    read_index.build()
    # A set of read names to iterate that structure quickly.
    read_names_set = {ali.query_name for ali in alignments}
    with pysam.AlignmentFile(new_bamfile, 'wb', template=alignments) as output:
        for read in read_names_set:
            # Filter out supplementary and unmapped alignments.
            read_alignments = [
                ali for ali in read_index.find(read)
                if not ali.is_supplementary and not ali.is_unmapped
            ]
            if read_alignments:
                best_AS = max(ali.get_tag("AS") for ali in read_alignments)
                # Bug fix: the original printed the first read having fewer
                # best-scoring alignments than total alignments and then
                # `break`-ed out of the loop (debug leftover), truncating the
                # output BAM.  Every read is now processed.
                for ali in read_alignments:
                    if ali.get_tag("AS") == best_AS:
                        output.write(ali)
    alignments.close()
    return new_bamfile
filter_only_max_AS_alignment_from_bam(FC30_bamfile, FC30_bam_output)
filter_only_max_AS_alignment_from_bam(FC29_bamfile, FC29_bam_output)
Using FeatureCounts to generate naive countings from filtered bamfiles
Options used :
For FC30 : 1.0% of alignments were successfully assigned (11009 / 1122989)
For FC29 : 2.1% of alignments were successfully assigned (61524 / 2983866)
Command line :
~/softs/subread-2.0.2-source/bin/featureCounts data/dmgoth_data/bam/FC30.against_dmgoth.filtered_max_AS.bam -a data/dmgoth_data/dmgoth101.onecode.fixed.gtf -g transcript_id -G data/dmgoth_data/dmgoth101_assembl_chrom.fasta -L --fracOverlapFeature 0.1 -M -R CORE -o FC30_counts_with_multimapping
~/softs/subread-2.0.2-source/bin/featureCounts data/dmgoth_data/bam/FC29.against_dmgoth.filtered_max_AS.bam -a data/dmgoth_data/dmgoth101.onecode.fixed.gtf -g transcript_id -G data/dmgoth_data/dmgoth101_assembl_chrom.fasta -L --fracOverlapFeature 0.1 -M -R CORE -o FC29_counts_with_multimapping
files located in data/dmgoth_data/featureCounts_results
Now that we have naive counts, we need to take care of multiple mappings. As said earlier, in case of AS ties, we do not want to assign the read once to each of the tied positions, but a single time to the group of positions.
In that instance, we got in the file FC30.against_dmgoth.filtered_max_AS.bam.featureCounts the
# featureCounts CORE per-read assignation files and the filtered versions
# (unassigned reads removed) that the next cells will produce.
FC30_read_assignation_file = "data/dmgoth_data/featureCounts_results/FC30.against_dmgoth.filtered_max_AS.bam.featureCounts"
new_FC30_read_assignation_file = FC30_read_assignation_file + ".filtered"
FC29_read_assignation_file = "data/dmgoth_data/featureCounts_results/FC29.against_dmgoth.filtered_max_AS.bam.featureCounts"
new_FC29_read_assignation_file = FC29_read_assignation_file + ".filtered"
# Filtering unassigned reads
def filter_unassigned_read(read_assignation_file, new_read_assignation_file):
    """Copy a featureCounts CORE read-assignation file, keeping only the
    reads assigned to at least one feature.

    Column 3 of a CORE file is the number of targets; featureCounts writes
    -1 there for unassigned reads, so those lines are dropped.
    """
    # `source`/`sink` avoid shadowing the input()/output builtins.
    with open(read_assignation_file, 'r') as source, \
         open(new_read_assignation_file, 'w') as sink:
        for line in source:
            nb_of_target = int(line.split()[2])
            if nb_of_target != -1:
                sink.write(line)
# Apply the filter to both samples (FC29 = male, FC30 = female).
filter_unassigned_read(FC29_read_assignation_file, new_FC29_read_assignation_file)
filter_unassigned_read(FC30_read_assignation_file, new_FC30_read_assignation_file)
# find reads mapped multiple times and their mapped positions
def get_multiple_mapped_reads(read_assignation_file):
    """Return {read_name: [positions]} for reads assigned to more than one
    position in a (filtered) featureCounts CORE file.

    Positions are listed in file order.  Single pass over the file; the
    original read the file twice and used a list membership test, which was
    quadratic in the number of lines.
    """
    positions_by_read = {}
    with open(read_assignation_file, 'r') as assignation:
        for line in assignation:
            fields = line.split()
            # fields[0] = read name, fields[3] = assigned position/feature.
            positions_by_read.setdefault(fields[0], []).append(fields[3])
    # Keep only reads seen on two or more lines.
    return {read: positions
            for read, positions in positions_by_read.items()
            if len(positions) > 1}
# {read: [positions]} for reads assigned to several positions, per sample.
FC30_multiple_mapped_reads_dict = get_multiple_mapped_reads(new_FC30_read_assignation_file)
FC29_multiple_mapped_reads_dict = get_multiple_mapped_reads(new_FC29_read_assignation_file)
# Raw featureCounts count tables (one row per TE insertion site).
FC30_countings_file = "data/dmgoth_data/featureCounts_results/FC30_counts_with_multimapping"
FC29_countings_file = "data/dmgoth_data/featureCounts_results/FC29_counts_with_multimapping"
def group_counts_of_multiple_mappings(countings_file, multiple_mapped_reads_dict):
    """Rewrite a featureCounts count table so each multi-mapped read counts once.

    For every multi-mapped read, its count is removed from each individual
    position and one count is credited to a synthetic row whose Geneid is
    the comma-joined (sorted) list of positions.  The result is written to
    '<countings_file>.grouped_multimap'.
    """
    df_counts = pd.read_csv(countings_file, sep="\t", comment='#')
    # rename() replaces the original in-place mutation of .columns.values,
    # which is not guaranteed to update the index in recent pandas.
    df_counts = df_counts.rename(columns={df_counts.columns[-1]: "Counts"})
    df_counts = df_counts[df_counts["Counts"].map(int) > 0]  # Removing empty counts
    for read, positions in multiple_mapped_reads_dict.items():
        row_list = []
        positions.sort()  # canonical order so identical groups merge
        for position in positions:
            row_list.append(df_counts.loc[df_counts.Geneid == position])
            # Give back the count this read contributed to the position.
            df_counts.loc[df_counts.Geneid == position, 'Counts'] -= 1
        # Synthetic row: field-wise comma-concatenation of the grouped rows.
        new_values = [str(i) for i in row_list[0].values[0]]
        for row in row_list[1:]:
            for index, val in enumerate(row.values[0]):
                new_values[index] += ',' + str(val)
        new_row = dict(zip(df_counts.columns.values, new_values))
        new_row["Counts"] = 1
        # If a row with the same grouped Geneid already exists, bump its count
        # (positions were sorted, so no other ordering can occur).
        # Bug fixes vs the original: `x in series` tests the INDEX labels, not
        # the values, and `df[mask, col]` is invalid indexing — use .loc.
        existing = df_counts.Geneid == new_row["Geneid"]
        if existing.any():
            df_counts.loc[existing, 'Counts'] += 1
        else:
            # DataFrame.append was removed in pandas 2.x — use concat instead.
            df_counts = pd.concat([df_counts, pd.DataFrame([new_row])],
                                  ignore_index=True)
    # Output the new count table.
    df_counts.to_csv(countings_file + ".grouped_multimap", sep="\t", index=False)
# Select one TE copy and plot its countings for each position
# (paths of the grouped count tables produced by the previous cell).
new_FC30_countings_file = "data/dmgoth_data/featureCounts_results/FC30_counts_with_multimapping.grouped_multimap"
new_FC29_countings_file = "data/dmgoth_data/featureCounts_results/FC29_counts_with_multimapping.grouped_multimap"
def remove_position_from_Geneid_col(row):
    """Add a 'TE_copy' field holding the Geneid stripped of its genomic
    position suffix (everything from the chromosome name onwards)."""
    # TODO: multimapped reads are not taken into account in the plot as they
    # add too many categories and break the graph... Need a fix, or at least
    # display the number of impacted reads somewhere on the graph.
    prefix = row.Geneid.split(row.Chr, 1)[0]
    # Drop the trailing '_' separating the copy name from the chromosome.
    row["TE_copy"] = prefix[:-1]
    return row
def get_counts_of_TE_copy(TE_name, counts_file):
    """Return the rows of a grouped count table whose TE copy name is TE_name."""
    raw_counts = pd.read_csv(counts_file, sep="\t", comment='#')
    annotated = raw_counts.apply(remove_position_from_Geneid_col, axis=1)
    return annotated[annotated.TE_copy == TE_name]
def plot_TE_expression_profile(TE_name, new_FC30_countings_file, new_FC29_countings_file):
    """Plot two donut charts (female / male) showing how the counts of one TE
    copy name distribute over its individual genomic insertion sites."""
    FC30_TE_df = get_counts_of_TE_copy(TE_name, new_FC30_countings_file)
    FC29_TE_df = get_counts_of_TE_copy(TE_name, new_FC29_countings_file)
    fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
    fig.add_trace(
        go.Pie(labels=FC30_TE_df["Geneid"], values=FC30_TE_df["Counts"], textinfo='value', name = "Female"),
        1, 1)
    fig.add_trace(
        go.Pie(labels=FC29_TE_df["Geneid"], values=FC29_TE_df["Counts"], textinfo='value', name = "Male"),
        1, 2)
    fig.update_traces(hole=.4, hoverinfo="label+value")
    fig.update_layout(
        title_text= TE_name + " expression profile per genome site",
        width=1200,
        height=500,
        # Add annotations in the center of the donut pies.
        annotations=[dict(text='Female', x=0.18, y=0.5, font_size=20, showarrow=False),
                     dict(text='Male', x=0.80, y=0.5, font_size=20, showarrow=False)])
    fig.show()
# Expression profiles for a few TE copies of interest.
plot_TE_expression_profile("Gypsy12_LTR", new_FC30_countings_file, new_FC29_countings_file)
plot_TE_expression_profile("Mariner2_DM", new_FC30_countings_file, new_FC29_countings_file)
plot_TE_expression_profile("Copia_LTR", new_FC30_countings_file, new_FC29_countings_file)
plot_TE_expression_profile("HMSBEAGLE_I", new_FC30_countings_file, new_FC29_countings_file)
Looks like there's a lot more reads mapping HMSBEAGLE in the alignment made on the genome than on the consensus (for example : 71 for female in genome VS 8 in consensus alignments)
Read 6515311f-5f41-4260-8100-ce3a66237170 : Unmapped in consensus alignement
Read 1382135b-2959-481f-8de6-5ee21ab7ee6f : Unmapped in consensus alignement
Read 5369157e-9445-4a2c-be16-9d7204ceae40 : Unmapped in consensus alignement
Read a6adb986-5cc5-49bf-ba7e-19592c1965d7 : Unmapped in consensus alignement
Read b74f8678-1d8a-4673-ba42-3b397f58c7a1 : Unmapped in consensus alignement
Read 6aa59df7-cc8e-479c-80f7-bfd8a5f734de : Unmapped in consensus alignement
With seaview: it looks like there are a lot of differences between the consensus sequence and the read sequence; maybe our lineage diverges a lot from the consensus sequences?
Pourquoi 545ecddf-e7ed-4e9b-96eb-fb310923c2b1 est présent dans FC29.against_dmgoth.filtered_max_AS.bam en tant que primaire mais pas visible dans IGV ?
def get_reads_mapped_on_feature(feature_counts_res, feature):
    """Return the rows of a featureCounts CORE assignation file whose target
    locus equals `feature`.

    Bug fix: CORE files have no header line, so header=None is required —
    without it pandas silently consumed the first read's line as the column
    names, losing that read.
    """
    FC_df = pd.read_csv(feature_counts_res, sep="\t", header=None)
    FC_df.columns = ["readID", "status", "nb_ali", "locus"]
    reads_mapped_on_feature = FC_df.loc[FC_df['locus'] == feature]
    return reads_mapped_on_feature
# Example query: which reads were assigned to this HMSBEAGLE insertion site?
feature_counts_res = "data/dmgoth_data/featureCounts_results/FC30.against_dmgoth.filtered_max_AS.bam.featureCounts.filtered"
feature = "HMSBEAGLE_I_X_RaGOO_4481038_4481298"
get_reads_mapped_on_feature(feature_counts_res, feature)
# NOTE(review): unfinished draft — the dangling `with open` and the
# incomplete `consensus_read_list =` make this cell syntactically invalid;
# it does not run as-is and was presumably abandoned mid-edit.
def get_reads_intersection_between_consensus_and_genome(consensus_bam_file, genome_featureCounts_res):
    with open
    consensus_alignment_file = pysam.AlignmentFile(consensus_bam_file, 'rb')
    consensus_read_list =
Annotations from repeatmasker contains "C" values in the strand column, meaning the match is with the Complement of the consensus sequence in the database (https://www.repeatmasker.org/webrepeatmaskerhelp.html#reading)
It is not supported by HTSeq, and as the strandness of these sequences doesn't matter to us here, I'll switch all Cs with dots in the new annotations.
For the same reason, I'll change score to "." when it contains multiple instance (ex : "5694/9768/39260/5380").
# Rewrite the repeatmasker gtf so HTSeq accepts it: replace "C" strands and
# multi-hit scores (e.g. "5694/9768/39260/5380") with ".".
# `gtf_in`/`gtf_out` avoid shadowing the input()/output builtins.
with open("/home/ericcumunel/Documents/goth101_TE_analysis/data/dmgoth_data/dmgoth101.onecode.fixed.sorted.gtf", "r") as gtf_in:
    with open("/home/ericcumunel/Documents/goth101_TE_analysis/data/dmgoth_data/dmgoth101.onecode.v2.gtf", "w") as gtf_out:
        for line in gtf_in:
            fields = line.split("\t")
            # Column 7 (index 6) is the strand; repeatmasker's "C" means the
            # match is with the complement of the consensus.
            if fields[6] == "C":
                fields[6] = "."
            # Column 6 (index 5) is the score.
            # Bug fix: the original substituted the stale `new_strand`
            # variable here instead of the freshly computed score — a
            # NameError (or a wrong value) for any non-"C" line whose score
            # contains "/".
            if "/" in fields[5]:
                fields[5] = "."
            # The trailing newline is preserved inside the last field.
            gtf_out.write("\t".join(fields))
# Inputs/outputs for extracting each annotated TE copy's sequence from the genome.
TE_gtf = "/home/ericcumunel/Documents/goth101_TE_analysis/data/dmgoth_data/dmgoth101.onecode.v2.gtf"
dmgoth_fasta = "/home/ericcumunel/Documents/goth101_TE_analysis/data/dmgoth_data/dmgoth101_assembl_chrom.fasta"
new_fasta = "/home/ericcumunel/Documents/goth101_TE_analysis/data/dmgoth_data/dmgoth101_repeatmasker_TE_sequences.fasta"
def generate_fasta_from_gtf(gtf, fasta, output_file_name):
    """Extract every feature interval of `gtf` from `fasta` and write the
    resulting sequences to a new FASTA file, named by their transcript_id."""
    gff_reader = HTSeq.GFF_Reader(gtf)
    fasta_sequences = dict((s.name, s) for s in HTSeq.FastaReader(fasta))
    with open(output_file_name, 'w') as output:
        for feature in gff_reader:
            chrom_seq = fasta_sequences[feature.iv.chrom].seq
            new_seq = HTSeq.Sequence(
                chrom_seq[feature.iv.start:feature.iv.end],
                feature.attr["transcript_id"],
            )
            new_seq.write_to_fasta_file(output)
generate_fasta_from_gtf(TE_gtf, dmgoth_fasta, new_fasta)
We need to count only reads that are included inside a TE (to count independently expressed TEs). Here's a script that can be used to get the alignment interval of one read on the genome. This interval can then be compared to the TE start and end positions to decide whether it is contained in the feature or not.
494S20M1I16M1I5M1D28M6I30M1D46M598N66M1D4M1D13M1D81M1I1M2D5M1D9M1I6M157N29M5D5M75N5M1I75M53N14M2D26M1D5M1D4M3D17M1D25M2D1M1D35M1D40M67N16M1I17M1I24M1D50M1I46M1D90M3D16M1D21M37S
S M I D N
# BAM of FC30 reads re-aligned against the extracted per-copy TE sequences.
FC30_bamfile = "/home/ericcumunel/Documents/goth101_TE_analysis/data/dmgoth_data/bam/FC30.against_dmgoth_TE_sequences.bam"
# FC30_bamfile = "/home/ericcumunel/Documents/goth101_TE_analysis/data/dmgoth_data/bam/FC30.against_dmgoth.filtered_max_AS.bam"
# Notebook display of the currently active path.
FC30_bamfile
# def get_alignment_interval(bamfile):
def filter_alignment_included_inside_TE(bamfile, annotations, output_bam, overflow_threshold = 0.1):
    """WIP — intended to keep only alignments contained (within a tolerated
    overflow) inside the TE feature they map to.

    Current debug state: prints the coordinates of the first alignment and
    returns it; `annotations`, `output_bam` and `overflow_threshold` are not
    used yet.  NOTE(review): the `break` after `return` is unreachable."""
    with pysam.AlignmentFile(bamfile, 'rb') as alignments:
        for ali in alignments :
            print(ali.query_name)
            print(ali.query_alignment_end)
            print(ali.reference_name)
            print(ali.reference_start)
            print(ali.reference_end)
            return ali
            break
    # Sketch of the intended filtering logic (not implemented yet):
    # ali_iv = get_alignment_interval(ali)
    # ali_start, ali_end = ali_iv
    # feature = get_alignment_feature(ali)
    # feature_iv = get_feature_iv(feature, annotations)
    # feature_start, feature_end = feature_iv
    # authorized_overflow_length = 0.1 * (feature_end - feature_start)
    # start_bound, end_bound = [feature_start - authorized_overflow_length, feature_end - authorized_overflow_length]
    # if start_bound <= ali_start <= end_bound and start_bound <= ali_end <= end_bound :
    # write_it_then
a = filter_alignment_included_inside_TE(FC30_bamfile, TE_gtf, FC30_bamfile[:-3] + "independantly_expressed_TE.bam")
# print(a)
# def get_alignment_interval(ali):
# Enable inline plotly rendering in the (offline) notebook.
import plotly
plotly.offline.init_notebook_mode()